Load packages
library(ISLR)
library(tidyverse)
library(plotly)
Load data. Data in NCI60 is 6,830 gene expressions measurements on 64 cancer cell lines. Each cell line is labelled with a cancer type which is specified in labs.
nci_labs <- NCI60$labs
nci_data <- NCI60$data
dim(nci_data)
## [1] 64 6830
nci_labs
## [1] "CNS" "CNS" "CNS" "RENAL" "BREAST"
## [6] "CNS" "CNS" "BREAST" "NSCLC" "NSCLC"
## [11] "RENAL" "RENAL" "RENAL" "RENAL" "RENAL"
## [16] "RENAL" "RENAL" "BREAST" "NSCLC" "RENAL"
## [21] "UNKNOWN" "OVARIAN" "MELANOMA" "PROSTATE" "OVARIAN"
## [26] "OVARIAN" "OVARIAN" "OVARIAN" "OVARIAN" "PROSTATE"
## [31] "NSCLC" "NSCLC" "NSCLC" "LEUKEMIA" "K562B-repro"
## [36] "K562A-repro" "LEUKEMIA" "LEUKEMIA" "LEUKEMIA" "LEUKEMIA"
## [41] "LEUKEMIA" "COLON" "COLON" "COLON" "COLON"
## [46] "COLON" "COLON" "COLON" "MCF7A-repro" "BREAST"
## [51] "MCF7D-repro" "BREAST" "NSCLC" "NSCLC" "NSCLC"
## [56] "MELANOMA" "BREAST" "BREAST" "MELANOMA" "MELANOMA"
## [61] "MELANOMA" "MELANOMA" "MELANOMA" "MELANOMA"
table(nci_labs)
## nci_labs
## BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA
## 7 5 7 1 1 6
## MCF7A-repro MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE
## 1 1 8 9 6 2
## RENAL UNKNOWN
## 9 1
nci_data <- as_tibble(nci_data)
nci_data$cancer_type <- nci_labs
nci_data <- nci_data %>%
select(cancer_type, everything())
Each observation is a cancer cell iine, each variable is a gene expression measurement. Standardise each of the gene expressions before calculating the principal components.
Cancer types are generally clustered together on the charts. This suggests that cell lines from the same cancer type tend to have similar gene expression levels. For example, look at melanoma in blue, colon in brown, and leukemia in pink.
pr_out <- nci_data %>%
select(-cancer_type) %>%
prcomp(scale = TRUE)
pr_vectors <- pr_out$x %>%
as_tibble()
pr_vectors$cancer_type <- nci_labs
chart_pc1_2 <- ggplot(pr_vectors, aes(x = PC1, y = PC2, colour = cancer_type)) +
geom_point()
chart_pc1_3 <- ggplot(pr_vectors, aes(x = PC1, y = PC3, colour = cancer_type)) +
geom_point()
ggplotly(chart_pc1_2)
ggplotly(chart_pc1_3)
Produce scree plot for pca. This shows the proportion of variance explained by each princiapl component. It looks like the proportion of variance explained by principal components beyond 6 or 7 drops off, so there might not be much value in examining principal components beyond 6 or 7.
pca_pve <- summary(pr_out)$importance
pca_pve <- pca_pve %>%
t() %>%
as_tibble()
names(pca_pve) <- c("st_dev", "pve", "cumulative_pve")
pca_pve$principal_component <- 1:64
plot_pve <- ggplot(pca_pve, aes(principal_component, pve)) +
geom_point() +
geom_line()
plot_cmt_pve <- ggplot(pca_pve, aes(principal_component, cumulative_pve)) +
geom_point() +
geom_line()
ggplotly(plot_pve)
ggplotly(plot_cmt_pve)
Produce dendograms of hierarchical clustering using 3 different linkage methods. Different linkage methods give very different clusterings. Single linkage gives trailing clusters with very large clusters where individual observations are joined one at a time. Complete and average linkage methods produce more balanced clusters.
nci_data_num <- nci_data %>%
select(-cancer_type) %>%
scale()
row.names(nci_data_num) <- nci_labs
hc_df <- tibble("method" = c("complete", "average", "single")) %>%
mutate(data = list(nci_data_num),
euc_dist = purrr::map(data, dist),
hc = purrr::map2(euc_dist, method, hclust))
plot_dgram <- function(hc, method_nm){
plot(hc, main = method_nm, xlab ="", sub = "", cex = .9)
}
hc_df <- hc_df %>%
mutate(dgram = purrr::map2(hc, method, plot_dgram)) %>%
select(-dgram)
Using complete linkage, assign each observation to one of four clusters. All leukemia cells are in cluster 3, all melanoma in cluster 1. So it looks like the gene expressions of cancer cells of the same cancer type are similar for some types of cancer.
extr_cluster_hc <- function(hc){
tibble("cluster" = as.character(cutree(hc, k = 4)))
}
cancer_type <- tibble("cancer_type" = nci_labs)
hc_df <- hc_df %>%
mutate(cluster = purrr::map(hc, extr_cluster_hc)) %>%
filter(method == "complete")
hcluster_4 <- cancer_type %>%
bind_cols(hc_df$cluster[[1]])
table(hcluster_4)
## cluster
## cancer_type 1 2 3 4
## BREAST 2 3 0 2
## CNS 3 2 0 0
## COLON 2 0 0 5
## K562A-repro 0 0 1 0
## K562B-repro 0 0 1 0
## LEUKEMIA 0 0 6 0
## MCF7A-repro 0 0 0 1
## MCF7D-repro 0 0 0 1
## MELANOMA 8 0 0 0
## NSCLC 8 1 0 0
## OVARIAN 6 0 0 0
## PROSTATE 2 0 0 0
## RENAL 8 1 0 0
## UNKNOWN 1 0 0 0
chart_cluster <- ggplot(hcluster_4, aes(cancer_type, fill = cluster)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplotly(chart_cluster)
Carry out hierarchical clustering on the first 6 principal components as we saw in 1. PCA that there was not much gain in the proportion of variance explained from adding principal components beyond 6.
hc_pca6_out <- hclust(dist(pr_out$x[, 1:6]))
plot(hc_pca6_out, labels = nci_labs, main = "Hierarchical clustering on the first 6 principal components", xlab = "", sub = "")
cluster <- tibble("cluster" = as.character(cutree(hc_pca6_out, 4)))
hcluster_4_pca <- cancer_type %>%
bind_cols(cluster)
table(hcluster_4_pca)
## cluster
## cancer_type 1 2 3 4
## BREAST 3 2 0 2
## CNS 5 0 0 0
## COLON 0 7 0 0
## K562A-repro 0 0 1 0
## K562B-repro 0 0 1 0
## LEUKEMIA 0 2 4 0
## MCF7A-repro 0 0 0 1
## MCF7D-repro 0 0 0 1
## MELANOMA 0 8 0 0
## NSCLC 1 8 0 0
## OVARIAN 0 6 0 0
## PROSTATE 0 2 0 0
## RENAL 3 6 0 0
## UNKNOWN 0 1 0 0
chart_cluster <- ggplot(hcluster_4_pca, aes(cancer_type, fill = cluster)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplotly(chart_cluster)